# sgemm_common_128x128

# Copyright 2014 Nervana Systems Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#    http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


// Prime the "j0" operand registers before entering LOOP: four 128-bit LDS
// reads fetch j0Ay0.., j0Bx0.., j0Ay4.. and j0Bx4.. from row 0 of the
// readAs / readBs buffers at word offsets 0 and 64.
// NOTE(review): 4x<...> presumably scales a 32-bit word offset to bytes — confirm
// against the assembler's expression syntax.
-:-:-:-:00 LDS.U.128 j0Ay0, [readAs + 4x<0*128 + 00>];
-:-:-:-:00 LDS.U.128 j0Bx0, [readBs + 4x<0*128 + 00>];
-:-:-:-:00 LDS.U.128 j0Ay4, [readAs + 4x<0*128 + 64>];
-:-:-:-:00 LDS.U.128 j0Bx4, [readBs + 4x<0*128 + 64>];
//-:-:-:-:00 NOP;

// Head of the main loop; the body is produced by the <CODE> generator below.
LOOP:

<CODE>

    # Generator for the unrolled inner loop body.
    #
    # Emits 8 steps (j = 0..7) of 64 FFMA instructions each, one FFMA per
    # cx<x>y<y> accumulator.  Step j reads its operands from register set
    # j & 1; the LDS.64 reads that refill the *other* register set for step
    # (j + 1) % 8 are interleaved after fixed FFMA slots.  On the last step
    # (j == 7) those reads are predicated on @P0.
    #
    # Inputs  (package globals owned by the surrounding framework):
    #   @top     - text placed ahead of the generated instructions
    #   %insert  - "j<J>c<C>" => instruction text emitted right after FFMA
    #              slot C of step J; entries set elsewhere are honoured, and
    #              this block adds/overwrites the eight LDS slots per step.
    # Returns: the generated instruction text as a single string.

    our @top;
    our %insert;

    # FFMA issue order as [x, y] accumulator coordinates (the "xsb" order).
    # Every cell of the 8x8 tile appears exactly once.
    my @cOrder = (
        [0,0], [0,1], [1,1], [2,0], [1,0], [2,1],
        [2,3], [2,2], [1,2], [0,3], [1,3], [0,2],
        [0,4], [0,5], [1,5], [2,4], [1,4], [2,5],
        [2,7], [2,6], [1,6], [0,7], [1,7], [0,6],
        [3,6], [3,7], [4,7], [5,6], [4,6], [5,7],
        [5,5], [5,4], [4,4], [3,5], [4,5], [3,4],
        [3,2], [3,3], [4,3], [5,2], [4,2], [5,3],
        [5,1], [5,0], [4,0], [3,1], [4,1], [3,0],
        [6,0], [6,1], [7,1], [6,2], [7,2], [6,3],
        [6,4], [6,5], [7,5], [6,6], [7,6], [6,7],
        [7,7], [7,4], [7,3], [7,0],
    );

    # Operand reads for the next step:
    #   [ FFMA slot the read follows, destination register suffix,
    #     base address register, word offset text inside 4x<...> ]
    # The offset is kept as text because '00' must be printed verbatim.
    my @loads = (
        [ 23, 'Ay0', 'readAs', '00' ],
        [  5, 'Bx0', 'readBs', '00' ],
        [ 35, 'Ay2', 'readAs', '2'  ],
        [ 11, 'Bx2', 'readBs', '2'  ],
        [ 41, 'Ay4', 'readAs', '64' ],
        [ 17, 'Bx4', 'readBs', '64' ],
        [ 47, 'Ay6', 'readAs', '66' ],
        [ 59, 'Bx6', 'readBs', '66' ],
    );

    my $out = join '', @top;
    foreach my $j (0 .. 7)
    {
        my $odd      = $j & 1;               # register set read by this step
        my $nOdd     = !$odd + 0;            # register set being refilled
        my $rsOffset = ($j + 1) % 8;         # row fetched for the next step
        my $rsPred   = $j == 7 ? '@P0' : '   ';

        foreach my $load (@loads)
        {
            my ($slot, $reg, $ptr, $off) = @$load;
            $insert{"j${j}c$slot"} = sprintf
                "T:-:D:S:00 %s LDS.64 j%d%s, [%s + 4x<%d*128 + %s>];\n",
                $rsPred, $nOdd, $reg, $ptr, $rsOffset, $off;
        }

        foreach my $c (0 .. 63)
        {
            my ($x, $y) = @{$cOrder[$c]};
            my $ins     = $insert{"j${j}c$c"} || '';

            # Every sixth slot (c % 6 == 5) and the last three slots get a
            # filler NOP when nothing else is scheduled there.
            $ins = "-:G:D:-:00 NOP;\n"
                if !$ins && ($c % 6 == 5 || $c > 60);

            # Control word ends in 04 when an instruction follows the FFMA or
            # for slots with c % 6 of 1 or 3; otherwise it ends in 05.
            my $ctrl = ($ins || $c % 6 == 1 || $c % 6 == 3)
                     ? "-:-:D:-:04"
                     : "-:-:D:-:05";

            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
        }
    }
    return $out;

</CODE>

// Loop tail: bump the iteration counter, advance the A/B element offsets by
// param_ldaz / param_ldbz, and rebuild the 64-bit track pointers
// (trackX0 = low word, trackX1 = high word) from the parameter base addresses.
-:-:-:-:00 IADD loop, loop, 1;
-:-:-:-:00 IADD ta, ta, param_ldaz;
-:-:-:-:00 IADD tb, tb, param_ldbz;
-:-:-:-:00 MOV k, param_k;
// P1 = (loop < param_loops)
-:-:-:-:00 ISETP.LT.AND P1, PT, loop, param_loops, PT;
// The LEA pairs below are kept for reference; the MOV/SHL/IADD sequences that
// follow replace them.
//-:-:-:-:00 LEA trackA0.CC, ta, param_A[0], 2;
//-:-:-:-:00 LEA.HI.X trackA1, ta, param_A[1], RZ, 2;
//-:-:-:-:00 LEA trackB0.CC, tb, param_B[0], 2;
//-:-:-:-:00 LEA.HI.X trackB1, tb, param_B[1], RZ, 2;
// trackA = param_A + (ta << 2); .CC/.X presumably carry from the low to the
// high word.  NOTE(review): the high word adds RZ, so any bits shifted out of
// the 32-bit (ta << 2) are not propagated, unlike the LEA.HI.X form — confirm
// offsets stay below 4 GiB.
-:-:-:-:00 MOV param_A_base_lo, param_A[0];
-:-:-:-:00 MOV param_A_base_hi, param_A[1];
-:-:-:-:00 SHL ta_shl, ta, 2;
-:-:-:-:00 IADD trackA0.CC, ta_shl, param_A_base_lo;
-:-:-:-:00 IADD.X trackA1, RZ, param_A_base_hi;

// trackB = param_B + (tb << 2), same scheme as trackA.
-:-:-:-:00 MOV param_B_base_lo, param_B[0];
-:-:-:-:00 MOV param_B_base_hi, param_B[1];
-:-:-:-:00 SHL tb_shl, tb, 2;
-:-:-:-:00 IADD trackB0.CC, tb_shl, param_B_base_lo;
-:-:-:-:00 IADD.X trackB1, RZ, param_B_base_hi;
//loop iteration
// While P1 holds, branch to REMAINDER (label defined outside this file).
-:-:-:-:00 @P1 BRA.U REMAINDER;

// Epilogue setup: derive the shared-memory staging addresses (writeCs /
// readCs), the output coordinates (cx00, cx64, cy00..cy12), the four 64-bit
// C row pointers (C00y, C04y, C08y, C12y) and the rounding constants that
// STORE_C uses.
-:-:-:-:00 S2R blockA, SR_CTAID.Y;
-:-:-:-:00 S2R blockB, SR_CTAID.Z;
-:-:-:-:00 S2R blockZ, SR_CTAID.X;
//<SCHEDULE_BLOCK>

-:-:-:-:00 LOP.AND tid_31, tid, 31;
-:-:-:-:00 LOP.AND tid_96, tid, 96;
-:-:-:-:00 LOP.AND tid_128, tid, 128;

// writeCs = (readAs / 4) * 128 + readBs;
// (ISCADD with shift 5 computes readAs * 32 + readBs, which is the same value.)
-:-:-:-:00 LOP.AND readAs, readAs, 0xfff;
-:-:-:-:00 LOP.AND readBs, readBs, 0xfff;
-:-:-:-:00 ISCADD writeCs, readAs, readBs, 5;

// cx = tid_31 | (tid_128 >> 2);
-:-:-:-:00 SHR.U32 cx00, tid_128, 2;
-:-:-:-:00 LOP.OR cx00, tid_31, cx00;

// readCs = ((tid_96 << 4) | cx) << 2;
-:-:-:-:00 SHL readCs, tid_96, 4;
-:-:-:-:00 LOP.OR readCs, readCs, cx00;
-:-:-:-:00 SHL readCs, readCs, 2;

// cx += blockB*128;
-:-:-:-:00 ISCADD cx00, blockB, cx00, 7;
-:-:-:-:00 IADD cx64, cx00, 64;

// cy = blockA*128 + (tid_96 >> 1)
-:-:-:-:00 SHR.U32 cy00, tid_96, 1;
//cy00 = blockA*128 + cy00
-:-:-:-:00 ISCADD cy00, blockA, cy00, 7;

// C += (ldcz*blockZ + ldc*cy + cx00) * 4;
-:-:-:-:00 MOV ldc, param_ldc;
-:-:-:-:00 MOV ldcz, param_ldcz;
//ci = ldc*cy00 + cx00
//-:-:-:-:00 XMAD.LO ci, ldc, cy00, cx00, xmad_c;
//-:-:-:-:00 XMAD.LO2 ci, ldcz, blockZ, ci;
-:-:-:-:00 IMAD ci, ldc, cy00, cx00;
-:-:-:-:00 IMAD ci, ldcz, blockZ, ci;
//-:-:-:-:00 LEA C00y0.CC, ci, param_C[0], 2;
//-:-:-:-:00 LEA.HI.X C00y1, ci, param_C[1], RZ, 2;

// param_A_base_lo/hi and ta_shl are reused here as scratch registers: they now
// hold the base address of C and the byte offset of element ci.
-:-:-:-:00 MOV param_A_base_lo, param_C[0];
-:-:-:-:00 MOV param_A_base_hi, param_C[1];
// ta_shl = ci << 2  (element index -> byte offset)
-:-:-:-:00 SHL ta_shl, ci, 2;
// C00y = param_C + ta_shl  (low word with carry-out, then high word)
-:-:-:-:00 IADD C00y0.CC, ta_shl, param_A_base_lo;
-:-:-:-:00 IADD.X C00y1, RZ, param_A_base_hi;

// Row strides in bytes: ldc1 = 1 row, ldc4 = 4 rows, ldc60 = 64 - 4 = 60 rows.
-:-:-:-:00 SHL ldc1, ldc, 2;
-:-:-:-:00 SHL ldc4, ldc, 4;
-:-:-:-:00 ISCADD ldc60, ldc, -ldc4, 8;

-:-:-:-:00 MOV alpha, param_alpha;
-:-:-:-:00 MOV beta, param_beta;
-:-:-:-:00 MOV flags, param_flags;

// Apply beta
// P6 = (beta != 0); STORE_C uses it to gate the loads of the existing C values.
-:-:-:-:00 ISETP.NE.AND P6, PT, beta, RZ, PT;

// Seed the Tausworthe
-:-:-:-:00 LOP.XOR lfsr0, seed, cx00;
-:-:-:-:00 S2R lfsr1, SR_CLOCKLO;
-:-:-:-:00 S2R lfsr2, SR_GLOBALTIMERLO;
-:-:-:-:00 LOP.AND clk_shf1, lfsr1, 31;
-:-:-:-:00 LOP.AND clk_shf2, lfsr2, 31;
-:-:-:-:00 LOP.XOR clk_shf1, clk_shf1, tid_31;
-:-:-:-:00 LOP.XOR clk_shf2, clk_shf2, tid_31;
-:-:-:-:00 SHF.R.U64 lfsr1, lfsr1, clk_shf1, lfsr1;
-:-:-:-:00 SHF.R.U64 lfsr2, lfsr2, clk_shf2, lfsr2;

// Find the exponent that will shift 32 bits of integer data
// out past the lsb of this number with the requested mantissa.
// exp_scale = (127 - 32 - (flags>>16)) << 23
-:-:-:-:00 SHR.U32 mantissa_bits, flags, 16;
-:-:-:-:00 IADD exp_scale, -mantissa_bits, 95;
-:-:-:-:00 SHL exp_scale, exp_scale, 23;

// Truncation mask used after round amount is added.
// trunc_mask = 0xffffffff << (23 - mantissa_bits)
-:-:-:-:00 IADD mantissa_bits, -mantissa_bits, 23;
-:-:-:-:00 MOV32I trunc_mask, -1;
//-:-:-:-:00 MOV trunc_mask, -1;
-:-:-:-:00 SHL trunc_mask, trunc_mask, mantissa_bits;

//</SCHEDULE_BLOCK>

// Derive the row pointers and row indices for rows +4, +8 and +12 from the
// row-0 values, and zero d0..d3 so the beta FFMAs in STORE_C add nothing
// until a load overwrites them.
-:-:-:-:00 IADD C04y0.CC, C00y0, ldc4;
-:-:-:-:00 MOV d0, RZ;
-:-:-:-:00 IADD cy04, cy00, 4;
-:-:-:-:00 IADD.X C04y1, C00y1, RZ;
-:-:-:-:00 IADD C08y0.CC, C04y0, ldc4;
-:-:-:-:00 MOV d1, RZ;
-:-:-:-:00 IADD cy08, cy00, 8;
-:-:-:-:00 IADD.X C08y1, C04y1, RZ;
-:-:-:-:00 IADD C12y0.CC, C08y0, ldc4;
-:-:-:-:00 MOV d2, RZ;
-:-:-:-:00 MOV d3, RZ;
-:-:-:-:00 IADD cy12, cy00, 12;
-:-:-:-:00 IADD.X C12y1, C08y1, RZ;

// Barrier before the staging stores in STORE_C start (writeCs is derived from
// readAs/readBs, the addresses the main loop reads from).
-:-:-:-:00 BAR.SYNC 0;

<CODE>

    # Generator for the write-out sequence: for each accumulator row y = 0..7,
    # scale cx0y<y>..cx7y<y> by alpha into c0..c7 and call STORE_C.  Just
    # before row 4 the four C row pointers and their row indices are moved
    # forward by 60 rows (ldc60 bytes / +60).
    # Returns the generated instruction text as a single string.

    # Pointer/index advance for the C00, C04, C08 and C12 row groups,
    # followed by one blank line.
    my $advance = join('', map {
        "-:-:-:-:00 IADD C${_}y0.CC, C${_}y0, ldc60;\n" .
        "-:-:-:-:00 IADD cy${_}, cy${_}, 60;\n" .
        "-:-:-:-:00 IADD.X C${_}y1, C${_}y1, RZ;\n"
    } qw(00 04 08 12)) . "\n";

    my $text = '';
    for my $row (0 .. 7)
    {
        if ($row == 4)
        {
            $text .= $advance;
        }

        # c<x> = cx<x>y<row> * alpha for x = 0..7
        for my $col (0 .. 7)
        {
            $text .= sprintf "-:-:-:-:00 FMUL c%d, cx%dy%d, alpha;\n", $col, $col, $row;
        }

        $text .= "-:-:-:-:00 CAL STORE_C;\n\n";
    }
    return $text;

</CODE>

// Write the final seed back to [Rand] when P4 is set, then end the kernel.
// NOTE(review): STORE_C leaves P4 forced false on return, so as written this
// store is never taken.
-:-:-:-:00 @P4 ST.E.CS [Rand], seed;
-:-:-:-:00 EXIT;

// STORE_C: write one accumulator row group to C.
//
// On entry c0..c7 hold the alpha-scaled accumulators for one y row.  They are
// staged through writeCs/readCs (STS.128 then LDS) and written out in two
// halves: rows C00y/C04y first, then rows C08y/C12y.  Each half adds
// beta * d0..d3, where d0..d3 are the existing C values loaded only when
// beta != 0 (P6) and the element is inside the param_n x param_m bounds.
// On exit cy00/cy04/cy08/cy12 are incremented by 1, the four row pointers are
// advanced by ldc1, P6 = (beta != 0) and P4 = false.
//
// The relu and stochastic-rounding paths are disabled in this variant: their
// flag tests are replaced by instructions that force the predicate false.
//
// NOTE(review): the first group of loads addresses through C00y / C04y while
// every other access uses C00y0 / C04y0 — confirm both names resolve to the
// same 64-bit pointer in the register map.
STORE_C:

//<SCHEDULE_BLOCK>
// P4/P5: column in range AND beta != 0 (P6).
-:-:-:-:00 ISETP.LT.AND P4, PT, cx00, param_n, P6;
-:-:-:-:00 ISETP.LT.AND P5, PT, cx64, param_n, P6;

// P0..P3: load predicates for rows cy00 / cy04, columns cx00 / cx64.
-:-:-:-:00 ISETP.LT.AND P0, PT, cy00, param_m, P4;
-:-:-:-:00 ISETP.LT.AND P1, PT, cy00, param_m, P5;
-:-:-:-:00 ISETP.LT.AND P2, PT, cy04, param_m, P4;
-:-:-:-:00 ISETP.LT.AND P3, PT, cy04, param_m, P5;

-:-:-:-:00 @P0 LD.E d0, [C00y + 4x<00>];
-:-:-:-:00 @P1 LD.E d1, [C00y + 4x<64>];
-:-:-:-:00 @P2 LD.E d2, [C04y + 4x<00>];
-:-:-:-:00 @P3 LD.E d3, [C04y + 4x<64>];

// Recompute P0..P3 as pure bounds checks (no beta term) for the stores.
-:-:-:-:00 ISETP.LT.AND P4, PT, cx00, param_n, PT;
-:-:-:-:00 ISETP.LT.AND P5, PT, cx64, param_n, PT;

-:-:-:-:00 ISETP.LT.AND P0, PT, cy00, param_m, P4;
-:-:-:-:00 ISETP.LT.AND P1, PT, cy00, param_m, P5;
-:-:-:-:00 ISETP.LT.AND P2, PT, cy04, param_m, P4;
-:-:-:-:00 ISETP.LT.AND P3, PT, cy04, param_m, P5;
-:-:-:-:00 IADD cy00, cy00, 1;
-:-:-:-:00 IADD cy04, cy04, 1;

// Apply relu
//-:-:-:-:00 LOP.AND.NZ P6, RZ, flags, 2;
// relu disabled: force P6 false so none of the FMNMX below execute
-:-:-:-:00 ISETP.LT.AND P6, PT, RZ, RZ, !PT;
-:-:-:-:00 @P6 FMNMX c0, c0, RZ, !PT;
-:-:-:-:00 @P6 FMNMX c1, c1, RZ, !PT;
-:-:-:-:00 @P6 FMNMX c2, c2, RZ, !PT;
-:-:-:-:00 @P6 FMNMX c3, c3, RZ, !PT;
-:-:-:-:00 @P6 FMNMX c4, c4, RZ, !PT;
-:-:-:-:00 @P6 FMNMX c5, c5, RZ, !PT;
-:-:-:-:00 @P6 FMNMX c6, c6, RZ, !PT;
-:-:-:-:00 @P6 FMNMX c7, c7, RZ, !PT;

// beta != 0
// Re-arm P6 = (beta != 0) after the relu block reused it.  The loads for rows
// cy08 / cy12 after END_ROUND1 are gated on P6; leaving it false here would
// skip them and the second-half FFMAs would add beta * (stale d0..d3 from
// rows cy00 / cy04).
-:-:-:-:00 ISETP.NE.AND P6, PT, beta, RZ, PT;

//<ORDERED>
// Stage c0..c7 through the C staging buffer and read back this thread's
// elements for rows 0 and 1 of the staged block.
-:-:-:-:00 STS.128 [writeCs+4x<00>], c0;
-:-:-:-:00 STS.128 [writeCs+4x<64>], c4;
-:-:-:-:00 LDS c0, [readCs + 4x<0*128 + 00>];
-:-:-:-:00 LDS c1, [readCs + 4x<0*128 + 64>];

// Stochastic Round flag
//-:-:-:-:00 LOP.AND.NZ P4, RZ, flags, 1;
// stochastic rounding disabled: set P4 as false
-:-:-:-:00 ISETP.LT.AND P4, PT, RZ, RZ, !PT;

-:-:-:-:00 LDS c2, [readCs + 4x<1*128 + 00>];
-:-:-:-:00 LDS c3, [readCs + 4x<1*128 + 64>];
//</ORDERED>
//</SCHEDULE_BLOCK>

// c += beta * d  (d0..d3 keep their previous contents when the load was skipped)
-:-:-:-:00 FFMA c0, d0, beta, c0;
-:-:-:-:00 FFMA c1, d1, beta, c1;
-:-:-:-:00 FFMA c2, d2, beta, c2;
-:-:-:-:00 FFMA c3, d3, beta, c3;

// P4 is forced false above, so the rounding block below is always skipped.
-:-:-:-:00 @!P4 BRA.U END_ROUND1;

//<SCHEDULE_BLOCK>
// Strip mantissa and leave sign+exponent
-:-:-:-:00 LOP32I.AND exp0, c0, 0xff800000;
-:-:-:-:00 LOP32I.AND exp1, c1, 0xff800000;
-:-:-:-:00 LOP32I.AND exp2, c2, 0xff800000;
-:-:-:-:00 LOP32I.AND exp3, c3, 0xff800000;

// Find the exponent that will shift 32 bits of integer data
// out past the lsb of this number as an fp16
// exp *= 2^-10 * 2^-32  (2^-42)
-:-:-:-:00 FMUL exp0, exp0, exp_scale;
-:-:-:-:00 FMUL exp1, exp1, exp_scale;
-:-:-:-:00 FMUL exp2, exp2, exp_scale;
-:-:-:-:00 FMUL exp3, exp3, exp_scale;

// lfsr0 = ((lfsr0 & 0xfffffffe) << 12) ^ (((lfsr0 << 13) ^ lfsr0) >> 19);
-:-:-:-:00 LOP32I.AND lfsr0_1, lfsr0, 0xfffffffe;
-:-:-:-:00 SHL lfsr0_1, lfsr0_1, 12;
-:-:-:-:00 SHL lfsr0_2, lfsr0, 13;
-:-:-:-:00 LOP.XOR lfsr0_2, lfsr0_2, lfsr0;
-:-:-:-:00 SHR.U32 lfsr0_2, lfsr0_2, 19;
-:-:-:-:00 LOP.XOR lfsr0, lfsr0_1, lfsr0_2;

// lfsr1 = ((lfsr1 & 0xfffffff8) <<  4) ^ (((lfsr1 << 2)  ^ lfsr1) >> 25);
-:-:-:-:00 LOP32I.AND lfsr1_1, lfsr1, 0xfffffff8;
-:-:-:-:00 SHL lfsr1_1, lfsr1_1, 4;
-:-:-:-:00 SHL lfsr1_2, lfsr1, 2;
-:-:-:-:00 LOP.XOR lfsr1_2, lfsr1_2, lfsr1;
-:-:-:-:00 SHR.U32 lfsr1_2, lfsr1_2, 25;
-:-:-:-:00 LOP.XOR lfsr1, lfsr1_1, lfsr1_2;

// lfsr2 = ((lfsr2 & 0xfffffff0) << 11) ^ (((lfsr2 << 3)  ^ lfsr2) >> 11);
-:-:-:-:00 LOP32I.AND lfsr2_1, lfsr2, 0xfffffff0;
-:-:-:-:00 SHL lfsr2_1, lfsr2_1, 11;
-:-:-:-:00 SHL lfsr2_2, lfsr2, 3;
-:-:-:-:00 LOP.XOR lfsr2_2, lfsr2_2, lfsr2;
-:-:-:-:00 SHR.U32 lfsr2_2, lfsr2_2, 11;
-:-:-:-:00 LOP.XOR lfsr2, lfsr2_1, lfsr2_2;

// rand = lfsr0 ^ lfsr1 ^ lfsr2;
// generate 3 other rotations of this rand
-:-:-:-:00 LOP3.LUT seed, lfsr0, lfsr1, lfsr2, 0x96;
-:-:-:-:00 SHF.R.U64 rand1, seed, 4, seed;
-:-:-:-:00 SHF.R.U64 rand2, seed, 8, seed;
-:-:-:-:00 SHF.R.U64 rand3, seed, 12, seed;

// Convert rand to float
-:-:-:-:00 I2F.F32.U32.RZ rand0, seed;
-:-:-:-:00 I2F.F32.U32.RZ rand1, rand1;
-:-:-:-:00 I2F.F32.U32.RZ rand2, rand2;
-:-:-:-:00 I2F.F32.U32.RZ rand3, rand3;

// Scale the random number so msb is one below lsb of fp16
// Add scaled random to number to round
-:-:-:-:00 FFMA.RZ c0, rand0, exp0, c0;
-:-:-:-:00 FFMA.RZ c1, rand1, exp1, c1;
-:-:-:-:00 FFMA.RZ c2, rand2, exp2, c2;
-:-:-:-:00 FFMA.RZ c3, rand3, exp3, c3;

// Truncate number to fp16
-:-:-:-:00 LOP.AND c0, c0, trunc_mask;
-:-:-:-:00 LOP.AND c1, c1, trunc_mask;
-:-:-:-:00 LOP.AND c2, c2, trunc_mask;
-:-:-:-:00 LOP.AND c3, c3, trunc_mask;
//</SCHEDULE_BLOCK>

END_ROUND1:

//<SCHEDULE_BLOCK>
// P4/P5: column in range AND beta != 0 (P6), for the second-half loads.
-:-:-:-:00 ISETP.LT.AND P4, PT, cx00, param_n, P6;
-:-:-:-:00 ISETP.LT.AND P5, PT, cx64, param_n, P6;

// Stochastic Round flag
//-:-:-:-:00 LOP.AND.NZ P6, RZ, flags, 1;
// stochastic rounding disabled: set P6 as false
-:-:-:-:00 ISETP.LT.AND P6, PT, RZ, RZ, !PT;

// First-half stores (rows cy00 / cy04), gated on the bounds-only P0..P3.
-:-:-:-:00 @P0 ST.E.CG [C00y0 + 4x<00>], c0;
-:-:-:-:00 @P1 ST.E.CG [C00y0 + 4x<64>], c1;
-:-:-:-:00 @P2 ST.E.CG [C04y0 + 4x<00>], c2;
-:-:-:-:00 @P3 ST.E.CG [C04y0 + 4x<64>], c3;

// P0..P3: load predicates for rows cy08 / cy12.
-:-:-:-:00 ISETP.LT.AND P0, PT, cy08, param_m, P4;
-:-:-:-:00 ISETP.LT.AND P1, PT, cy08, param_m, P5;
-:-:-:-:00 ISETP.LT.AND P2, PT, cy12, param_m, P4;
-:-:-:-:00 ISETP.LT.AND P3, PT, cy12, param_m, P5;

-:-:-:-:00 @P0 LD.E d0, [C08y0 + 4x<00>];
-:-:-:-:00 @P1 LD.E d1, [C08y0 + 4x<64>];
-:-:-:-:00 @P2 LD.E d2, [C12y0 + 4x<00>];
-:-:-:-:00 @P3 LD.E d3, [C12y0 + 4x<64>];

// Recompute P0..P3 as pure bounds checks for the second-half stores.
-:-:-:-:00 ISETP.LT.AND P4, PT, cx00, param_n, PT;
-:-:-:-:00 ISETP.LT.AND P5, PT, cx64, param_n, PT;

-:-:-:-:00 ISETP.LT.AND P0, PT, cy08, param_m, P4;
-:-:-:-:00 ISETP.LT.AND P1, PT, cy08, param_m, P5;
-:-:-:-:00 ISETP.LT.AND P2, PT, cy12, param_m, P4;
-:-:-:-:00 ISETP.LT.AND P3, PT, cy12, param_m, P5;
//</SCHEDULE_BLOCK>

// Advance the first-half row pointers by one row and bump the second-half
// row indices.
-:-:-:-:00 IADD C00y0.CC, C00y0, ldc1;
-:-:-:-:00 IADD cy08, cy08, 1;
-:-:-:-:00 IADD cy12, cy12, 1;
-:-:-:-:00 IADD.X C00y1, C00y1, RZ;
-:-:-:-:00 IADD C04y0.CC, C04y0, ldc1;
-:-:-:-:00 IADD.X C04y1, C04y1, RZ;

// Read back rows 2 and 3 of the staged block.
-:-:-:-:00 LDS c0, [readCs + 4x<2*128 + 00>];
-:-:-:-:00 LDS c1, [readCs + 4x<2*128 + 64>];
-:-:-:-:00 LDS c2, [readCs + 4x<3*128 + 00>];
-:-:-:-:00 LDS c3, [readCs + 4x<3*128 + 64>];

-:-:-:-:00 FFMA c0, d0, beta, c0;
-:-:-:-:00 FFMA c1, d1, beta, c1;
-:-:-:-:00 FFMA c2, d2, beta, c2;
-:-:-:-:00 FFMA c3, d3, beta, c3;

// P6 is forced false above, so the rounding block below is always skipped.
-:-:-:-:00 @!P6 BRA.U END_ROUND2;

//<SCHEDULE_BLOCK>
// Strip mantissa and leave sign+exponent
-:-:-:-:00 LOP32I.AND exp0, c0, 0xff800000;
-:-:-:-:00 LOP32I.AND exp1, c1, 0xff800000;
-:-:-:-:00 LOP32I.AND exp2, c2, 0xff800000;
-:-:-:-:00 LOP32I.AND exp3, c3, 0xff800000;

// Find the exponent that will shift 32 bits of integer data
// out past the lsb of this number as an fp16
// exp *= 2^-10 * 2^-32  (2^-42)
-:-:-:-:00 FMUL exp0, exp0, exp_scale;
-:-:-:-:00 FMUL exp1, exp1, exp_scale;
-:-:-:-:00 FMUL exp2, exp2, exp_scale;
-:-:-:-:00 FMUL exp3, exp3, exp_scale;

// generate 4 other rotations of this seed
-:-:-:-:00 SHF.R.U64 rand0, seed, 16, seed;
-:-:-:-:00 SHF.R.U64 rand1, seed, 20, seed;
-:-:-:-:00 SHF.R.U64 rand2, seed, 24, seed;
-:-:-:-:00 SHF.R.U64 rand3, seed, 28, seed;

// Convert rand to float
-:-:-:-:00 I2F.F32.U32.RZ rand0, rand0;
-:-:-:-:00 I2F.F32.U32.RZ rand1, rand1;
-:-:-:-:00 I2F.F32.U32.RZ rand2, rand2;
-:-:-:-:00 I2F.F32.U32.RZ rand3, rand3;

// Scale the random number so msb is one below lsb of fp16
// Add scaled random to number to round
-:-:-:-:00 FFMA.RZ c0, rand0, exp0, c0;
-:-:-:-:00 FFMA.RZ c1, rand1, exp1, c1;
-:-:-:-:00 FFMA.RZ c2, rand2, exp2, c2;
-:-:-:-:00 FFMA.RZ c3, rand3, exp3, c3;

// Truncate number to fp16
-:-:-:-:00 LOP.AND c0, c0, trunc_mask;
-:-:-:-:00 LOP.AND c1, c1, trunc_mask;
-:-:-:-:00 LOP.AND c2, c2, trunc_mask;
-:-:-:-:00 LOP.AND c3, c3, trunc_mask;
//</SCHEDULE_BLOCK>

END_ROUND2:

// Restore P6 = (beta != 0) for the next STORE_C call.
-:-:-:-:00 ISETP.NE.AND P6, PT, beta, RZ, PT; // beta != 0

// Second-half stores (rows cy08 / cy12).
-:-:-:-:00 @P0 ST.E.CG [C08y0 + 4x<00>], c0;
-:-:-:-:00 @P1 ST.E.CG [C08y0 + 4x<64>], c1;

// Stochastic Round flag
//-:-:-:-:00 LOP.AND.NZ P4, RZ, flags, 1;
// stochastic rounding disabled: set P4 as false
-:-:-:-:00 ISETP.LT.AND P4, PT, RZ, RZ, !PT;

-:-:-:-:00 @P2 ST.E.CG [C12y0 + 4x<00>], c2;
-:-:-:-:00 @P3 ST.E.CG [C12y0 + 4x<64>], c3;

// Advance the second-half row pointers by one row.
-:-:-:-:00 IADD C08y0.CC, C08y0, ldc1;
-:-:-:-:00 IADD.X C08y1, C08y1, RZ;
-:-:-:-:00 IADD C12y0.CC, C12y0, ldc1;
-:-:-:-:00 IADD.X C12y1, C12y1, RZ;

-:-:-:-:00 RET;
